import pandas as pd
import seaborn as sns
import numpy as np
from matplotlib import pyplot as plt
from datetime import timedelta
from pandas.plotting import scatter_matrix
%matplotlib inline
The first dataset is an export of my ride data from Strava, an online social
network site for cycling and other sports. This data is a log of every ride since the start of 2018
and contains summary data like the distance and average speed. It was exported using
the script stravaget.py which uses the stravalib module to read data. Some details of
the fields exported by that script can be seen in the documentation for stravalib.
The exported data is a CSV file so that's easy to read, however the date information in the file is recorded in a different timezone (UTC) so we need to do a bit of conversion. In reading the data I'm setting the index of the data frame to be the datetime of the ride.
# Load the Strava export; the 'date' column is parsed into a (UTC) DatetimeIndex.
strava = pd.read_csv('data/strava_export.csv', index_col='date', parse_dates=True)
# Convert the UTC timestamps to local Sydney time so they line up with the
# GoldenCheetah data loaded below.
strava.index = strava.index.tz_convert('Australia/Sydney')
strava.head()
The second dataset comes from an application called GoldenCheetah which provides some analytics services over ride data. This has some of the same fields but adds a lot of analysis of the power, speed and heart rate data in each ride. This data overlaps with the Strava data but doesn't include all of the same rides.
Again we create an index using the datetime for each ride, this time combining two columns in the data (date and time) and localising to Sydney so that the times match those for the Strava data.
# GoldenCheetah CSV has spaces after delimiters, hence skipinitialspace.
cheetah = pd.read_csv('data/cheetah.csv', skipinitialspace=True)
# Build one DatetimeIndex from the separate 'date' and 'time' text columns.
cheetah.index = pd.to_datetime(cheetah['date'] + ' ' + cheetah['time'])
# These stamps are naive local times, so localise (not convert) to Sydney.
cheetah.index = cheetah.index.tz_localize('Australia/Sydney')
cheetah.head()
The GoldenCheetah data contains many many variables (columns) and I won't go into all of them here. Some that are of particular interest for the analysis below are:
Here are definitions of some of the more important fields in the data. Capitalised fields come from the GoldenCheetah data while lowercase_fields come from Strava. There are many cases where fields are duplicated and in this case the values should be the same, although there is room for variation as the algorithm used to calculate them could be different in each case.
Some of the GoldenCheetah parameters are defined in their documentation.
Your first task is to combine these two data frames using the join method of Pandas. The goal is to keep only those rows of data
that appear in both data frames so that we have complete data for every row.
# Inner join on the datetime index keeps only rides present in BOTH sources.
table_merge = pd.merge(cheetah, strava, left_index=True, right_index=True, how='inner')
table_merge.head()
# Drop rides where device_watts is False (no real power-meter data).
# .copy() makes an independent frame so the column edits below do not
# raise SettingWithCopyWarning / silently write into a view of table_merge.
not_MTB = table_merge[table_merge.device_watts != False].copy()
not_MTB['Time Moving'] = not_MTB['Time Moving'] / 60  # seconds -> minutes
not_MTB['Duration'] = not_MTB['Duration'] / 60        # seconds -> minutes (Duration, not Time Moving)
# Create a new DataFrame with just the columns used in the analysis below.
new_MTB = not_MTB[['Average Heart Rate', 'Average Speed', 'Average Power', 'distance', 'Elevation Gain', 'Time Moving', 'TSS', 'Duration', 'workout_type']]
new_MTB = new_MTB.sort_values(['distance', 'TSS'])
# Normalise the Strava lowercase names to Title Case to match the rest.
new_MTB.columns = ['Average Heart Rate', 'Average Speed', 'Average Power', 'Distance', 'Elevation Gain', 'Time Moving', 'TSS', 'Duration', 'Workout Type']
# Distribution plots (histogram + KDE) of average power and average speed,
# drawn side by side on one figure.
fig, axes = plt.subplots(1, 2)
for axis, column in zip(axes, ['Average Power', 'Average Speed']):
    sns.distplot(new_MTB[column], ax=axis)
# Three distribution plots side by side: distance, duration and training
# stress score (the original comment said "two" but this row has three).
fig, axes = plt.subplots(1, 3)
for axis, column in zip(axes, ['Distance', 'Duration', 'TSS']):
    sns.distplot(new_MTB[column], ax=axis)
# Defines corr(), used below to annotate each upper-triangle panel.
# NOTE(review): assumed from the map_upper usage - confirm in the script.
%run scripts/PearsonValue.py
# Pairwise scatter matrix of all variables, Pearson r on the upper half.
g = sns.pairplot(new_MTB,
                 palette = 'husl')
g.map_upper(corr)
plt.show()
# Effort variables (power, speed, heart rate) coloured by workout type.
sns.pairplot(new_MTB,
             vars = ['Average Power', 'Average Speed', 'Average Heart Rate'],
             hue = 'Workout Type',
             palette = 'husl')
# Ride-scale variables, same colouring by workout type.
sns.pairplot(new_MTB,
             vars = ['Distance', 'Elevation Gain', 'Time Moving', 'TSS'],
             hue = 'Workout Type',
             palette = 'husl')
Race, Workout and Ride.
# Melt DataFrame
# Reshape to long form - one row per (Workout Type, variable, value) -
# so a single categorical plot can show every variable at once.
melt_MTB = pd.melt(new_MTB,
                   id_vars=['Workout Type'],
                   var_name="Vars")
plt.figure(figsize=(10,6)) #plot size
# Box plot of each variable's distribution, split by workout type.
sns.boxplot(x = 'Vars',
            y = 'value',
            data = melt_MTB,
            hue = 'Workout Type',
            palette = 'husl')
plt.title('Workout Category')
plt.legend(bbox_to_anchor=(1, 1), loc=2)  # legend outside the axes
plt.xticks(rotation=-10)
# Swarm plot of the same long-form data: every ride as one point, grouped
# by variable and dodged (separated) by workout type.
plt.figure(figsize=(10,6))
sns.swarmplot(x = 'Vars',
              y = 'value',
              data = melt_MTB,
              hue = 'Workout Type',
              dodge = True,  # 'split' is the deprecated old name for this parameter
              palette = 'husl')
plt.legend(bbox_to_anchor=(1, 1), loc=2)
plt.xticks(rotation=-10)
kudos? Is there anything to indicate which rides are more popular? Explore the relationship between the main variables and kudos. Show a plot and comment on any relationship you observe.
plt.figure(figsize=(10,6))
# Mean distance per kudos count, split by workout type; ci=None suppresses
# the bootstrap confidence-interval bars.
sns.barplot(x = 'kudos',
            y = 'distance',
            data = not_MTB,
            hue = 'workout_type',
            palette = 'husl',
            ci = None)
plt.legend(bbox_to_anchor=(1, 1), loc=2)
plt.xticks(rotation=-10)
plt.figure(figsize=(10,6))
# Mean average speed per kudos count, split by workout type.
sns.barplot(x = 'kudos',
            y = 'Average Speed',
            data = not_MTB,
            hue = 'workout_type',
            palette = 'husl',
            ci = None)
plt.legend(bbox_to_anchor=(1, 1), loc=2)
plt.xticks(rotation=-10)
plt.figure(figsize=(10,6))
# Mean training stress score per kudos count, split by workout type.
sns.barplot(x = 'kudos',
            y = 'TSS',
            data = not_MTB,
            hue = 'workout_type',
            palette = 'husl',
            ci = None)
plt.legend(bbox_to_anchor=(1, 1), loc=2)
plt.xticks(rotation=-10)
# Defines Summary, MY and func used below.
# NOTE(review): assumed from the usage in this cell - confirm in scripts/CBtwo.
%run scripts/CBtwo
#Display DataFrame
# NOTE(review): the second positional argument of from_records is `index`,
# so MY becomes the row labels (presumably month/year keys) - confirm.
df_DT=pd.DataFrame.from_records(Summary, MY)
df_DT=df_DT.dropna()
df_DT.columns=['Total Distance', 'Total TSS', 'Average Speed']
df_DT
# Pairwise relationships between the three monthly summary columns.
sns.pairplot(df_DT)
# NOTE(review): presumably plots month 5 of 2019; func's signature is
# defined in scripts/CBtwo - confirm there.
func(5,2019)
%matplotlib inline
# Loads the energy dataset into `energydata` (plus helpers used below).
# NOTE(review): assumed from usage - confirm in scripts/Portfolio2_util.py.
%run scripts/Portfolio2_util.py
# Silence library deprecation noise for the rest of the notebook.
import warnings; warnings.simplefilter('ignore')
energydata.head()
# Appliance energy consumption per month as a line chart.
energydata.plot.line(x = 'Month',
                     y = 'Appliances',
                     figsize=(10,6),
                     color = '#2E86C1',
                     title='Appliances Consumption Per Month',
                     legend = False)
# Convenience column holding just the calendar date of each reading.
energydata['Date'] = [d.date() for d in energydata.index]
def func(start, end):
    """Line-plot appliance consumption between two index labels.

    Slices the module-level `energydata` frame with a label slice
    (both endpoints included for date-string labels) and returns the
    matplotlib axes produced by pandas.
    """
    window = energydata[start:end]
    return window.plot.line(y='Appliances',
                            figsize=(10, 6),
                            color='#2E86C1',
                            title='Frequency of Energy Consumption',
                            legend=False)
# One example week, 13-19 January 2016.
func('2016-01-13', '2016-01-19')
plt.figure(figsize=(10,6))
# Histogram (no KDE) of consumption readings over the whole dataset.
sns.distplot(energydata['Appliances'],
             kde = False,
             bins = 80,
             color = '#2E86C1').set_title('Frequency of Energy Consumption')
plt.figure(figsize=(10,6))
# Box plot of the same variable to highlight the upper-tail outliers.
sns.boxplot(x=energydata['Appliances'],
            color = '#2E86C1').set_title('Frequency of Energy Consumption')
# Defines corr(), used to annotate the upper-triangle panels below.
# NOTE(review): assumed from the map_upper usage - confirm in the script.
%run scripts/PearsonValue.py
# Appliances vs lights and the first three temperature/humidity sensor
# pairs: KDE on the diagonal, Pearson r upper triangle, regression lower.
g = sns.pairplot(energydata,
                 vars = ['Appliances', 'lights', 'T1', 'RH_1', 'T2', 'RH_2', 'T3', 'RH_3'],
                 palette = 'husl',
                 diag_kind = 'kde',
                 diag_kws=dict(shade=True))
g.map_upper(corr)
g.map_lower(sns.regplot, line_kws = {'color':'red'})
plt.show()
# Same layout for the T4-T6 / RH_4-RH_6 sensor pairs.
g = sns.pairplot(energydata,
                 vars = ['Appliances', 'T4', 'RH_4', 'T5', 'RH_5', 'T6', 'RH_6'],
                 palette = 'husl',
                 diag_kind = 'kde',
                 diag_kws=dict(shade=True))
g.map_upper(corr)
g.map_lower(sns.regplot, line_kws = {'color':'red'})
plt.show()
# Same layout for the outdoor/weather variables plus NSM and T6.
g = sns.pairplot(energydata,
                 vars = ['Appliances', 'T_out', 'Press_mm_hg', 'RH_out', 'Windspeed', 'Visibility', 'Tdewpoint', 'NSM','T6'],
                 palette = 'husl',
                 diag_kind = 'kde',
                 diag_kws=dict(shade=True))
g.map_upper(corr)
g.map_lower(sns.regplot, line_kws = {'color':'red'})
plt.show()
# Correlation heatmaps for the given column groups.
# NOTE(review): heatmap_func and its argument semantics come from a %run
# script - confirm what 2,3,4,5 select there.
heatmap_func(2,3,4,5)
# Recursive-feature-elimination results (built in a %run script).
RFE_df
# Summary statistics table (built in a %run script).
stats_df
Cross Validation per Model, RMSE and R-Squared
# Candidate regressors, each evaluated with 5-fold cross validation.
model_list = [linear_model.LinearRegression(),
              RandomForestRegressor(n_estimators=20, random_state=0),
              GradientBoostingRegressor(n_estimators=20, learning_rate=0.25, max_depth=13, random_state=0),
              SVR(kernel = "rbf")]
model_name = ['Linear Regression', 'Random Forest', 'Gradient Boosting', 'SVR']
X = energydata[all_cols]
y = energydata['Appliances']
cv_rec = []
for name, model in zip(model_name, model_list):
    # Mean R-squared across the 5 folds.
    r2_cv = cross_val_score(model, X, y, cv=5, scoring = 'r2').mean()
    # 'neg_mean_squared_error' is the NEGATED MSE: negate and take the
    # square root so the column labelled RMSE actually holds an RMSE
    # (the original stored the raw negative MSE under that label).
    rmse_cv = np.sqrt(-cross_val_score(model, X, y, cv=5, scoring = 'neg_mean_squared_error').mean())
    cv_rec.append((name, r2_cv, rmse_cv))
# One row per model, indexed by model name.
cv_df = pd.DataFrame(cv_rec, columns = ['Model', 'R-Squared', 'RMSE'])
cv_df = cv_df.set_index('Model')
# Horizontal bar charts comparing the models on each metric.
plt.figure(figsize=(5,3))
cv_df['R-Squared'].plot(kind = 'barh',
                        title='R-Squared',
                        legend = False)
plt.figure(figsize=(5,3))
cv_df['RMSE'].plot(kind = 'barh',
                   title='RMSE',
                   legend = False)
As seen from the graphs below, both models give NSM as the most important variable, but the rest of the variables differ in importance per model.
# Fit an ensemble of randomized trees and rank features by importance.
# (The original also computed utils.multiclass.type_of_target into
# X_train_rf / y_train_rf; those results were never used, so the dead
# assignments have been removed.)
# NOTE(review): ExtraTreesClassifier treats each distinct 'Appliances'
# value as a class; an ExtraTreesRegressor would match the regression
# task - confirm before switching, as the importances will differ.
forest = ExtraTreesClassifier(n_estimators=20,
                              random_state=0)
X = energydata[all_cols]
y = energydata['Appliances']
forest.fit(X, y)
importances = forest.feature_importances_
# Tabulate and sort ascending so the top feature is the top bar.
RF_data = {'Features': all_cols, 'Importance': importances}
RF_df = pd.DataFrame(RF_data)
RF_df = RF_df.sort_values(by = ['Importance'])
plt.figure(figsize=(10,6))
RF_df.plot(kind = 'barh',
           x = 'Features',
           y = 'Importance',
           title='RF Feature Importance',
           legend = False)
plt.xlabel('Importance')
from xgboost import XGBClassifier
from xgboost import plot_importance
# Gradient-boosted trees; importances drawn by xgboost's built-in helper.
model = XGBClassifier()
model.fit(X, y)
plot_importance(model)
# NOTE(review): plot_importance has already drawn its plot, so this
# trailing plt.figure appears to open a new empty figure - confirm intent.
plt.figure(figsize=(10,6))
K-means clustering is one of the simplest and most popular unsupervised learning algorithms. Typically, unsupervised algorithms make inferences from datasets using only input vectors, without referring to known or labelled outcomes. This notebook illustrates the process of K-means clustering by generating some random clusters of data and then showing the iterations of the algorithm as random cluster means are updated.
We first generate random data around 4 centers.
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
%matplotlib inline
import copy
center_1 = np.array([1,2])
center_2 = np.array([6,6])
center_3 = np.array([9,1])
center_4 = np.array([-5,-1])
# Generate random data and center it to the four centers each with a different variance
np.random.seed(5)
data_1 = np.random.randn(200,2) * 1.5 + center_1
data_2 = np.random.randn(200,2) * 1 + center_2
data_3 = np.random.randn(200,2) * 0.5 + center_3
data_4 = np.random.randn(200,2) * 0.8 + center_4
data = np.concatenate((data_1, data_2, data_3, data_4), axis = 0)
# All generated points in black, before any cluster assignment.
plt.scatter(data[:,0], data[:,1], s=7, c='k')
plt.show()
You need to generate four random centres.
This part of portfolio should contain at least:
k is set to 4; centres = np.random.randn(k, c) * std + mean, where std and mean are the standard deviation and mean of the data, and c is the number of features in the data. Set the random seed to 6. Colour the four centres green, blue, yellow, and cyan, and set the edgecolors to red.
std = {1: 1.25, 2: 1.75, 3: 1.5, 4: 2}
k = 4
np.random.seed(6)
# Random (x, y) centre for each of the k clusters, components drawn in 1..19.
mean = {i+1: [np.random.randint(1,20), np.random.randint(1,20)]
        for i in range(k)}
# 100 normal points around each centre, spread per the std dict above.
# NOTE(review): despite the name, `centres` holds the generated DATA POINTS;
# the cluster centres themselves live in `mean`.
centres = {i+1: np.random.randn(100,2) * std[i+1] + mean[i+1]
           for i in range(k)}
# Stack the four point clouds into a single frame with columns x and y.
df_centres = pd.DataFrame()
for i in centres.keys():
    df_dump = pd.DataFrame(centres[i])
    df_centres = df_centres.append(df_dump, ignore_index=True)
df_centres = df_centres.rename(columns = {0: 'x', 1: 'y'})
# One colour per cluster id: green, blue, yellow, cyan.
colmap = {1: 'g', 2: 'b', 3: 'y', 4: 'c'}
# Grey data points, with each cluster mean in its colour and a red edge.
for i in mean.keys():
    plt.scatter(centres[i][:,0], centres[i][:,1], color = 'k', alpha = 0.2)
    plt.scatter(*mean[i], color = colmap[i], edgecolors = 'r')
plt.show()
You need to implement the process of k-means clustering. Implement each iteration as a separate cell: assign each data point to the closest centre, then update the cluster centres based on the data, then plot the new clusters.
def clustering(df_centres, mean):
    """Assign every point in df_centres to its nearest cluster mean.

    Adds one distance column per cluster, a 'closest' column holding the
    winning cluster id, and a 'color' column looked up from the module-level
    colmap. Returns the augmented frame.
    """
    # Euclidean distance from every point to each cluster mean. This is
    # fully vectorised over the columns; the original wrapped it in a
    # per-row iterrows() loop that ignored its loop variable and just
    # recomputed the same columns once per row (accidental O(n^2)).
    for i in mean.keys():
        df_centres['distance_from_{}'.format(i)] = np.sqrt(
            (df_centres['x'] - mean[i][0])**2 +
            (df_centres['y'] - mean[i][1])**2)
    mean_dist = ['distance_from_{}'.format(i)
                 for i in mean.keys()]
    # The column name with the smallest distance identifies the nearest cluster.
    df_centres['closest'] = df_centres.loc[:, mean_dist].idxmin(axis=1)
    # Parse the cluster id off the column name. split('_')[-1] is used
    # instead of lstrip('distance_from_'): lstrip strips a character SET,
    # not a prefix, and would corrupt ids containing those characters.
    df_centres['closest'] = df_centres['closest'].map(lambda x: int(x.split('_')[-1]))
    df_centres['color'] = df_centres['closest'].map(lambda x: colmap[x])
    return df_centres
mean_old = copy.deepcopy(mean)
def update(mean):
    """Recompute each cluster mean from its assigned points (in place).

    Reads the module-level df_centres produced by clustering(). A cluster
    with no assigned points keeps its previous mean rather than collapsing
    to NaN. Returns the (mutated) mean dict.
    """
    for i in mean.keys():
        members = df_centres[df_centres['closest'] == i]
        # Guard: np.mean of an empty selection is NaN, which would
        # permanently destroy the cluster on the next assignment pass.
        if len(members):
            mean[i][0] = np.mean(members['x'])
            mean[i][1] = np.mean(members['y'])
    return mean
# First assignment pass: colour every point by its nearest initial centre.
df_centres = clustering(df_centres, mean)
plt.scatter(df_centres['x'], df_centres['y'], color = df_centres['color'], alpha = 0.2)
for i in centres.keys():
    plt.scatter(*mean[i], color = colmap[i], edgecolors = 'r')
plt.show()
# Iterate update/assign until every cluster mean stops moving, recording
# the means after each update so the trajectory can be inspected later.
df_mean = pd.DataFrame()
converged = False
while not converged:
    mean_old = copy.deepcopy(mean)
    mean = update(mean)
    # Append this iteration's means as one row of {cluster: [x, y]}.
    mean_dp = pd.DataFrame(mean)
    mean_dp = mean_dp.T
    mean_dp = mean_dp.rename(columns = {0: 'x', 1: 'y'})
    mean_dp['xy'] = mean_dp.apply(lambda r: [r['x'], r['y']], axis=1)
    mean_dp = mean_dp.drop(['x', 'y'], axis=1)
    mean_dp = mean_dp.T
    df_mean = df_mean.append(mean_dp, ignore_index = True)
    # Re-assign every point to the moved means.
    df_centres = clustering(df_centres, mean)
    # Converged only when NO cluster moved on either axis. The original
    # overwrote x_diff/y_diff inside a for loop, so it effectively tested
    # only the LAST cluster, and its `and` condition stopped as soon as
    # either axis alone was unchanged.
    converged = all(mean[i][0] == mean_old[i][0] and mean[i][1] == mean_old[i][1]
                    for i in mean.keys())
plt.scatter(df_centres['x'], df_centres['y'], color = df_centres['color'], alpha = 0.2)
for i in centres.keys():
    plt.scatter(*mean[i], color = colmap[i], edgecolors = 'r')
plt.show()
# Label the columns and display the history of cluster means per iteration.
df_mean = df_mean.rename(columns = {1: 'Cluster 1', 2: 'Cluster 2', 3: 'Cluster 3', 4: 'Cluster 4'})
df_mean